﻿using Abot.Crawler;
using Abot.Poco;
using FC.Data;
using Newtonsoft.Json;
using System;

namespace FC.Tools
{
    /// <summary>
    /// Console entry point that crawls the Taobao judicial-sale item list
    /// (https://sf.taobao.com/item_list.htm) with the Abot crawler and
    /// deserializes the JSON block embedded in each page's
    /// &lt;script id="sf-item-list-data"&gt; element into <c>FC.Data.SMList</c>.
    /// </summary>
    class AbotHelper
    {
        /// <summary>
        /// Root URL to crawl (Taobao judicial-sale item list).
        /// </summary>
        public static readonly Uri baseUrl = new Uri(@"https://sf.taobao.com/item_list.htm");


        static void Main(string[] args)
        {
            Console.WriteLine("Begin");
            var crawler = GetManuallyConfiguredWebCrawler();
            var result = crawler.Crawl(baseUrl);

            // Only print the exception when one actually occurred; on a clean
            // run result.ErrorException is null and printing it is just noise.
            if (result.ErrorOccurred)
            {
                Console.WriteLine(result.ErrorException);
            }
            Console.WriteLine("end");
        }


        /// <summary>
        /// Builds a <see cref="PoliteWebCrawler"/> with an explicit in-code
        /// configuration (instead of app.config) and wires up all decision
        /// delegates and lifecycle events.
        /// </summary>
        /// <returns>A fully configured crawler ready for <c>Crawl()</c>.</returns>
        public static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            CrawlConfiguration config = new CrawlConfiguration();

            #region Configuration

            // Timeout for the ENTIRE crawl, in seconds; 0 = no limit.
            // (Not a per-connection timeout.)
            config.CrawlTimeoutSeconds = 0;
            // MIME types the crawler is allowed to download.
            config.DownloadableContentTypes = "text/html, text/plain";
            // Do not crawl pages on external domains.
            config.IsExternalPageCrawlingEnabled = false;
            // Do not follow links found on external pages.
            config.IsExternalPageLinksCrawlingEnabled = false;
            // Ignore robots.txt. NOTE(review): impolite toward the target
            // site — confirm this is intentional before production use.
            config.IsRespectRobotsDotTextEnabled = false;
            // Never re-crawl a URI already seen. Keeping this false also
            // bounds the in-memory "already crawled" bookkeeping.
            config.IsUriRecrawlingEnabled = false;
            // Scale concurrency with the machine rather than hard-coding it;
            // too many threads can overwhelm the target server.
            config.MaxConcurrentThreads = Environment.ProcessorCount;
            // Hard cap on total pages crawled; 0 would mean unlimited.
            config.MaxPagesToCrawl = 1000;
            // Per-domain page cap; 0 = unlimited (the global cap still applies).
            config.MaxPagesToCrawlPerDomain = 0;
            // Politeness delay between requests to the same domain (ms).
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;

            #endregion

            // All-null trailing args = use Abot's default implementations for
            // decision maker, thread manager, scheduler, requester, parser, etc.
            var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);

            // Decision delegates: which pages/links to crawl or download.
            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
            crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
            // Lifecycle events.
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            return crawler;
        }

        /// <summary>
        /// Raised when a page was not allowed to be crawled.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Carries the disallowed page.</param>
        static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("页面不允许爬取:" + pageToCrawl.Uri.AbsoluteUri);
        }

        /// <summary>
        /// Raised when the links of a page were not allowed to be crawled.
        /// Logged instead of silently swallowed so disallowed links are visible.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Carries the page whose links were disallowed.</param>
        static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            Console.WriteLine("页面链接不允许爬取:" + e.CrawledPage.Uri.AbsoluteUri);
        }

        /// <summary>
        /// Decides whether a page should be crawled. Currently allows every
        /// page; the per-URI filtering (regex match on news URLs) that used to
        /// live here was disabled and has been removed.
        /// </summary>
        private static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext context)
        {
            return new CrawlDecision { Allow = true };
        }

        /// <summary>
        /// Decides whether a page's content should be downloaded.
        /// Always allows — the completed-event handler needs the HTML body.
        /// </summary>
        private static CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            return new CrawlDecision
            {
                Allow = true
            };
        }

        /// <summary>
        /// Raised when the crawl of a single page starts.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Carries the page about to be crawled.</param>
        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("页面爬取开始:" + pageToCrawl.Uri.AbsoluteUri);
        }

        /// <summary>
        /// Raised when the crawl of a single page completes. Extracts the
        /// embedded item-list JSON and deserializes it into <c>SMList</c>.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Carries the crawled page and its parsed document.</param>
        public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            var crawledPage = e.CrawledPage;

            // A failed or non-HTML request leaves the parsed document null;
            // guard so a single bad page cannot throw inside the crawl thread.
            if (crawledPage.AngleSharpHtmlDocument == null)
            {
                Console.WriteLine("页面爬取结束:" + crawledPage.Uri.AbsoluteUri);
                return;
            }

            var list = crawledPage.AngleSharpHtmlDocument.QuerySelectorAll("script#sf-item-list-data");
            foreach (var item in list)
            {
                // Strip both LF and CR so Windows line endings don't survive
                // into the JSON payload.
                var json = item.InnerHtml.Replace("\n", "").Replace("\r", "");
                try
                {
                    SMList ll = JsonConvert.DeserializeObject<SMList>(json);
                }
                catch (JsonException ex)
                {
                    // Malformed payload on one page should not abort the crawl.
                    Console.WriteLine("JSON 解析失败:" + crawledPage.Uri.AbsoluteUri + " " + ex.Message);
                }
            }
            Console.WriteLine("页面爬取结束:" + crawledPage.Uri.AbsoluteUri);
        }

        /// <summary>
        /// Decides whether links on a crawled page should be followed.
        /// Always refuses — this crawl only processes the root listing page.
        /// </summary>
        /// <param name="crawledPage">The page whose links are being considered.</param>
        /// <param name="crawlContext">The overall crawl context.</param>
        /// <returns>A deny decision with a reason.</returns>
        private static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            return new CrawlDecision { Allow = false, Reason = "不要爬取页面链接" };
        }
    }
}
